In [1]:
import numpy as np
import pandas as pd
#load the files
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
data = pd.concat([train, test])
#size of training dataset
train_samples = train.shape[0]
#print some of them
data.head()
Out[1]:
In [2]:
# remove the Id feature, because it is not useful for price prediction
data.drop(['Id'], axis=1, inplace=True);
In [3]:
data.info()
In [4]:
print("Size training: {}".format(train.shape[0]))
print("Size testing: {}".format(test.shape[0]))
In [5]:
datanum = data.select_dtypes(include=[np.number])
In [6]:
datanum.columns[datanum.isnull().any()].tolist()
Out[6]:
In [7]:
#number of rows containing at least one NaN
print(datanum.shape[0] - datanum.dropna().shape[0])
In [8]:
#list of columns with NaN
datanum.columns[datanum.isnull().any()].tolist()
Out[8]:
NaN values will be filled with the mean of the feature they belong to. This keeps each feature's mean unchanged, although imputing missing entries this way can still bias the predictions; we'll accept that trade-off.
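As a quick illustration with toy values (not taken from the dataset), filling with the column mean removes the missing entry while leaving the column mean unchanged:

# toy DataFrame with one missing value (illustration only)
toy = pd.DataFrame({'LotFrontage': [60.0, 80.0, np.nan, 70.0]})
toy_filled = toy.fillna(toy.mean())  # the NaN becomes 70.0, the mean of the observed values
print(toy_filled)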
In [9]:
#Filling each column with its mean (DataFrame.mean() ignores NaN by default)
datanum_no_nan = datanum.fillna(datanum.mean())
#check
datanum_no_nan.columns[datanum_no_nan.isnull().any()].tolist()
Out[9]:
This step is especially useful when we are dealing with neural networks. At this point we don't know what kind of model we are going to implement, so the data will be normalized.
Normalization also keeps the values in a small range, which improves the speed of the calculations.
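For reference, a minimal sketch of the standardization applied below with sklearn's StandardScaler, written out by hand on a toy array:

# z-score standardization: z = (x - mean) / std (same convention as StandardScaler, ddof=0)
x = np.array([1.0, 2.0, 3.0, 4.0])
z = (x - x.mean()) / x.std()
print(z)  # roughly [-1.34, -0.45, 0.45, 1.34]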
In [10]:
import matplotlib.pyplot as plt
%matplotlib inline
# All numeric features except the 'SalePrice'
datanum_no_nan.drop(['SalePrice'], axis=1).head(30).plot(legend=False);
In [11]:
# SalePrice
datanum_no_nan['SalePrice'].head(30).plot(legend=False);
In [12]:
# Showing SalePrice distribution
data.SalePrice.hist(bins=50)
Out[12]:
The 'SalePrice' distribution is skewed. We can stabilize it by applying a logarithmic transformation, which is safe because we know all the values are positive.
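A quick sanity check, assuming scipy is available, is to compare the skewness before and after the log transform:

from scipy.stats import skew
prices = data.SalePrice.dropna()  # the test rows have no SalePrice
print("Skewness before log: {:.2f}".format(skew(prices)))
print("Skewness after log:  {:.2f}".format(skew(np.log(prices))))  # should be much closer to 0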
In [13]:
# Transforming to non-skewed SalePrice
data.SalePrice = data.SalePrice.apply(np.log)
data.SalePrice.hist(bins=50)
Out[13]:
In [14]:
#Squeeze the data using standard scaler: z = (x - mean)/ std
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
columns = datanum_no_nan.columns.drop('SalePrice')
print("Features: {}".format(columns))
In [15]:
#make a copy so the unscaled DataFrame is preserved
data_norm = datanum_no_nan.copy()
data_norm[columns] = scaler.fit_transform(data_norm[columns])
print("Train shape: {}".format(data_norm.shape))
data_norm.drop(['SalePrice'], axis=1).head(30).plot(legend=False);
In [16]:
# Correlation features
data_norm.corr()['SalePrice'].sort_values(ascending=False).head(10)
Out[16]:
In [17]:
high_corr_feat_names = data_norm.corr()['SalePrice'].sort_values(ascending=False).head(10).axes[0].tolist()
high_corr_feat_names.remove('SalePrice')
data_norm_high_corr = data_norm[high_corr_feat_names]
In [18]:
#heatmap between the most correlated features
import seaborn as sns
fig = plt.figure(figsize=(7, 5))
sns.heatmap(data_norm_high_corr.corr());
In [19]:
#plotting distributions of numeric features
data_norm_high_corr.hist(bins=50, figsize=(22,16));
In [20]:
#Relationships between correlated features
for feature in high_corr_feat_names:
    data.plot.scatter(feature, 'SalePrice');
In [21]:
from sklearn.model_selection import KFold
y = np.array(data['SalePrice'])
X = np.array(data_norm_high_corr)
#split by idx
idx = train_samples
X_train, X_test = X[:idx], X[idx:]
y_train, y_test = y[:idx], y[idx:]
print("Shape X train: {}".format(X_train.shape))
print("Shape y train: {}".format(y_train.shape))
print("Shape X test: {}".format(X_test.shape))
print("Shape y test: {}".format(y_test.shape))
kf = KFold(n_splits=3, random_state=9, shuffle=True)
print(kf)
In [22]:
#plotting PCA
from sklearn.decomposition import PCA
def getX_PCA(X):
    pca = PCA(n_components=1)
    return pca.fit(X).transform(X)

def plotPCA(X, y):
    pca = PCA(n_components=1)
    X_r = pca.fit(X).transform(X)
    plt.plot(X_r, y, 'x')
In [23]:
from sklearn.covariance import EllipticEnvelope
# fit the model
ee = EllipticEnvelope(contamination=0.1,
                      assume_centered=True,
                      random_state=9)
ee.fit(X_train)
pred = ee.predict(X_train)
X_train_orig = X_train
y_train_orig = y_train
X_bad = X_train[pred != 1]
y_bad = y_train[pred != 1]
X_train = X_train[pred == 1]
y_train = y_train[pred == 1]
print("Number samples: {}".format(X_train.shape[0]))
#kept samples vs. removed anomalies, projected onto the first principal component
plt.scatter(getX_PCA(X_train), y_train)
plt.scatter(getX_PCA(X_bad), y_bad)
Out[23]:
In [24]:
# Get polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_orig = poly.fit_transform(X_train_orig)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)
In [25]:
# Linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lr = LinearRegression()
#
batch = 0
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    lr.fit(X_t, y_t)
    #calculate costs
    t_error = mean_squared_error(y_t, lr.predict(X_t))**0.5
    v_error = mean_squared_error(y_v, lr.predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f} Score: {:.2f}"
          .format(batch, t_error, v_error, lr.score(X_v, y_v)))
    batch += 1
#Scores
print("Training score: {:.4f}".format(lr.score(X_train_orig, y_train_orig)))
#RMSLE
rmsle = mean_squared_error(y_train_orig, lr.predict(X_train_orig))**0.5
print("RMSLE: {:.4f}".format(rmsle))
# Plotting the results
plt.scatter(lr.predict(X_train_orig), y_train_orig)
Out[25]:
In [26]:
# Gradient boosting
from sklearn import ensemble
params = {'n_estimators': 100, 'max_depth': X_train.shape[1], 'min_samples_split': 5,
          'learning_rate': 0.1, 'loss': 'ls', 'random_state': 9, 'warm_start': True}
gbr = ensemble.GradientBoostingRegressor(**params)
batch = 0
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    gbr.fit(X_t, y_t)
    #calculate costs
    t_error = mean_squared_error(y_t, gbr.predict(X_t))**0.5
    v_error = mean_squared_error(y_v, gbr.predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f} Score: {:.2f}"
          .format(batch, t_error, v_error, gbr.score(X_v, y_v)))
    batch += 1
#Scores
print("Training score: {:.4f}".format(gbr.score(X_train_orig, y_train_orig)))
#RMSLE
rmsle = mean_squared_error(y_train_orig, gbr.predict(X_train_orig))**0.5
print("RMSLE: {:.4f}".format(rmsle))
# Plotting the results
plt.scatter(gbr.predict(X_train_orig), y_train_orig)
Out[26]:
In [27]:
# AdaBoost
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
abr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=X_train.shape[1]),
                        n_estimators=100, random_state=9)
batch = 0
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    abr.fit(X_t, y_t)
    #calculate costs
    t_error = mean_squared_error(y_t, abr.predict(X_t))**0.5
    v_error = mean_squared_error(y_v, abr.predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f} Score: {:.2f}"
          .format(batch, t_error, v_error, abr.score(X_v, y_v)))
    batch += 1
#Scores
print("Training score: {:.4f}".format(abr.score(X_train_orig, y_train_orig)))
#RMSLE
rmsle = mean_squared_error(y_train_orig, abr.predict(X_train_orig))**0.5
print("RMSLE: {:.4f}".format(rmsle))
# Plotting the results
plt.scatter(abr.predict(X_train_orig), y_train_orig)
Out[27]:
In order to improve on the previous predictions, we will build a simple stacked model: the predictions of the three base regressors (linear regression, gradient boosting and AdaBoost) are standardized and used as level-1 features for a second linear regression.
In [28]:
from sklearn.linear_model import LinearRegression
slr = LinearRegression()
sclr = preprocessing.StandardScaler()
def features_level1(X):
    X0 = lr.predict(X)
    X1 = gbr.predict(X)
    X2 = abr.predict(X)
    Xt = np.array([X0, X1, X2]).T
    return sclr.fit_transform(Xt)

def stack_training(X, y):
    slr.fit(features_level1(X), y)

def stack_predict(X):
    return slr.predict(features_level1(X))

def stack_score(X, y):
    return slr.score(features_level1(X), y)
#
batch = 0
kf = KFold(n_splits=3, random_state=9, shuffle=True)
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    stack_training(X_t, y_t)
    #calculate costs
    t_error = mean_squared_error(y_t, stack_predict(X_t))**0.5
    v_error = mean_squared_error(y_v, stack_predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f} Score: {:.2f}"
          .format(batch, t_error, v_error, stack_score(X_v, y_v)))
    batch += 1
rmsle = mean_squared_error(y_train_orig, stack_predict(X_train_orig))**0.5
print("RMSLE: {:.4f}".format(rmsle))
# Plotting the results
plt.scatter(stack_predict(X_train_orig), y_train_orig)
Out[28]:
In [29]:
def avg_predict(X):
    return (lr.predict(X) + gbr.predict(X) + abr.predict(X)) / 3
predictions = avg_predict(X_train_orig)
RMSLE = mean_squared_error(y_train_orig, predictions)**0.5
print("RMSLE: {:.3f}".format(RMSLE))
# Plotting the results
plt.scatter(avg_predict(X_train_orig), y_train_orig)
Out[29]:
In [30]:
from sklearn.metrics import mean_squared_error
RMSLE_lr = mean_squared_error(y_train, lr.predict(X_train))**0.5
RMSLE_gbr = mean_squared_error(y_train, gbr.predict(X_train))**0.5
RMSLE_abr = mean_squared_error(y_train, abr.predict(X_train))**0.5
RMSLE_avg = mean_squared_error(y_train, avg_predict(X_train))**0.5
RMSLE_stack = mean_squared_error(y_train, stack_predict(X_train))**0.5
print("RMSLE lr: {:.3f}".format(RMSLE_lr))
print("RMSLE gbr: {:.3f}".format(RMSLE_gbr))
print("RMSLE abr: {:.3f}".format(RMSLE_abr))
print("RMSLE average: {:.3f}".format(RMSLE_avg))
print("RMSLE stacked: {:.3f}".format(RMSLE_stack))
In [31]:
predict = avg_predict(X_test)
#predict = stack_predict(X_test)
#predict = lr.predict(X_test)
#predictions are in log scale, convert back to prices
predict = np.exp(predict)
file = "Id,SalePrice\n"
startId = 1461  # first Id of the test set
for i in range(len(X_test)):
    file += "{},{}".format(startId, int(predict[i])) + "\n"
    startId += 1
#print(file)
In [32]:
# Save to file
with open('attempt.txt', 'w') as f:
    f.write(file)
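As an alternative, the same submission could be written more directly with pandas; a minimal sketch, assuming the test Ids run consecutively from 1461 (as above) and using a hypothetical file name 'attempt.csv':

# build the submission as a DataFrame and let pandas handle the CSV formatting
submission = pd.DataFrame({'Id': np.arange(1461, 1461 + len(X_test)),
                           'SalePrice': np.exp(avg_predict(X_test))})
submission.to_csv('attempt.csv', index=False)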
In [ ]: